O dataset

Disponível em: http://insideairbnb.com/get-the-data.html. Referência para seguir: https://www.kaggle.com/josipdomazet/mining-nyc-airbnb-data-using-r

Download:

Importa o dataset

## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   1.0.0     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   id = col_double(),
##   scrape_id = col_double(),
##   last_scraped = col_date(format = ""),
##   thumbnail_url = col_logical(),
##   medium_url = col_logical(),
##   xl_picture_url = col_logical(),
##   host_id = col_double(),
##   host_since = col_date(format = ""),
##   host_is_superhost = col_logical(),
##   host_listings_count = col_double(),
##   host_total_listings_count = col_double(),
##   host_has_profile_pic = col_logical(),
##   host_identity_verified = col_logical(),
##   neighbourhood_group_cleansed = col_logical(),
##   latitude = col_double(),
##   longitude = col_double(),
##   is_location_exact = col_logical(),
##   accommodates = col_double(),
##   bathrooms = col_double(),
##   bedrooms = col_double()
##   # ... with 40 more columns
## )
## See spec(...) for full column specifications.
## Warning: 3 parsing failures.
##   row     col           expected             actual         file
##  1745 license 1/0/T/F/TRUE/FALSE +1512-6670366      <connection>
## 28567 license 1/0/T/F/TRUE/FALSE 56131/AL           <connection>
## 34253 license 1/0/T/F/TRUE/FALSE 05.557.336/0001-70 <connection>
## # A tibble: 35,451 x 17
##    name  host_name neighbourhood_c… latitude longitude property_type
##    <chr> <fct>     <fct>               <dbl>     <dbl> <fct>        
##  1 Very… Matthias  Copacabana          -23.0     -43.2 Condominium  
##  2 Beau… Viviane   Copacabana          -23.0     -43.2 Apartment    
##  3 NICE… Renata    Ipanema             -23.0     -43.2 Apartment    
##  4 Cosy… Patricia  Ipanema             -23.0     -43.2 Apartment    
##  5 COPA… Patricia… Copacabana          -23.0     -43.2 Loft         
##  6 Copa… Seba      Copacabana          -23.0     -43.2 Apartment    
##  7 Beac… Alex      Ipanema             -23.0     -43.2 Serviced apa…
##  8 Rio … Vana      Copacabana          -23.0     -43.2 Apartment    
##  9 4bed… Marcio    Copacabana          -23.0     -43.2 Apartment    
## 10 HUma… Marcio    Humaitá             -23.0     -43.2 Apartment    
## # … with 35,441 more rows, and 11 more variables: room_type <fct>,
## #   price <dbl>, accommodates <dbl>, bedrooms <dbl>, minimum_nights <dbl>,
## #   availability_365 <dbl>, number_of_reviews <dbl>,
## #   review_scores_rating <dbl>, cancellation_policy <fct>,
## #   require_guest_profile_picture <lgl>,
## #   require_guest_phone_verification <lgl>

Descrição dos dados

# Per-column summary of the listings table.
summary(airbnb)
##      name             host_name                  neighbourhood_cleansed
##  Length:35451       Daniel :  432   Copacabana              : 8825     
##  Class :character   Ricardo:  322   Barra da Tijuca         : 3908     
##  Mode  :character   Maria  :  315   Ipanema                 : 2970     
##                     Marcelo:  311   Jacarepaguá             : 1917     
##                     Mario  :  309   Botafogo                : 1767     
##                     (Other):33702   Recreio dos Bandeirantes: 1750     
##                     NA's   :   60   (Other)                 :14314     
##     latitude        longitude                 property_type  
##  Min.   :-23.07   Min.   :-43.74   Apartment         :27023  
##  1st Qu.:-22.98   1st Qu.:-43.32   House             : 3709  
##  Median :-22.97   Median :-43.20   Condominium       : 1618  
##  Mean   :-22.96   Mean   :-43.25   Serviced apartment:  903  
##  3rd Qu.:-22.94   3rd Qu.:-43.19   Loft              :  653  
##  Max.   :-22.75   Max.   :-43.10   Bed and breakfast :  281  
##                                    (Other)           : 1264  
##            room_type         price          accommodates    
##  Entire home/apt:25006   Min.   :    0.0   Min.   :  1.000  
##  Private room   : 9586   1st Qu.:  150.0   1st Qu.:  2.000  
##  Shared room    :  859   Median :  281.0   Median :  4.000  
##                          Mean   :  622.2   Mean   :  4.175  
##                          3rd Qu.:  599.0   3rd Qu.:  5.000  
##                          Max.   :40000.0   Max.   :160.000  
##                                                             
##     bedrooms      minimum_nights     availability_365 number_of_reviews
##  Min.   : 0.000   Min.   :   1.000   Min.   :  0.0    Min.   :  0.000  
##  1st Qu.: 1.000   1st Qu.:   1.000   1st Qu.:  0.0    1st Qu.:  0.000  
##  Median : 1.000   Median :   2.000   Median :179.0    Median :  1.000  
##  Mean   : 1.637   Mean   :   4.736   Mean   :190.1    Mean   :  7.952  
##  3rd Qu.: 2.000   3rd Qu.:   4.000   3rd Qu.:362.0    3rd Qu.:  5.000  
##  Max.   :22.000   Max.   :1123.000   Max.   :365.0    Max.   :350.000  
##  NA's   :23                                                            
##  review_scores_rating                  cancellation_policy
##  Min.   : 20.00       flexible                   :15552   
##  1st Qu.: 93.00       moderate                   : 5757   
##  Median : 98.00       strict                     :    2   
##  Mean   : 94.35       strict_14_with_grace_period:13589   
##  3rd Qu.:100.00       super_strict_30            :  163   
##  Max.   :100.00       super_strict_60            :  388   
##  NA's   :17455                                            
##  require_guest_profile_picture require_guest_phone_verification
##  Mode :logical                 Mode :logical                   
##  FALSE:34873                   FALSE:34850                     
##  TRUE :578                     TRUE :601                       
##                                                                
##                                                                
##                                                                
## 
# Compact overview of the table: one line per column with its type and
# first values.
airbnb %>% glimpse()
## Observations: 35,451
## Variables: 17
## $ name                             <chr> "Very Nice 2Br - Copacabana - W…
## $ host_name                        <fct> Matthias, Viviane, Renata, Patr…
## $ neighbourhood_cleansed           <fct> Copacabana, Copacabana, Ipanema…
## $ latitude                         <dbl> -22.96592, -22.97712, -22.98302…
## $ longitude                        <dbl> -43.17896, -43.19045, -43.21427…
## $ property_type                    <fct> Condominium, Apartment, Apartme…
## $ room_type                        <fct> Entire home/apt, Entire home/ap…
## $ price                            <dbl> 296, 161, 243, 337, 221, 150, 3…
## $ accommodates                     <dbl> 5, 3, 3, 3, 2, 2, 13, 1, 11, 3,…
## $ bedrooms                         <dbl> 2, 1, 1, 1, 1, 1, 6, 1, 4, 1, 1…
## $ minimum_nights                   <dbl> 4, 4, 2, 2, 3, 2, 2, 3, 4, 5, 3…
## $ availability_365                 <dbl> 332, 352, 125, 122, 145, 89, 29…
## $ number_of_reviews                <dbl> 233, 232, 260, 160, 303, 1, 54,…
## $ review_scores_rating             <dbl> 93, 94, 96, 94, 98, NA, 91, 98,…
## $ cancellation_policy              <fct> strict_14_with_grace_period, st…
## $ require_guest_profile_picture    <lgl> FALSE, TRUE, FALSE, TRUE, FALSE…
## $ require_guest_phone_verification <lgl> FALSE, TRUE, FALSE, TRUE, TRUE,…

Missing Data

Missing antes de remover

# Count the missing values (NA) in every column, reshape the one-row result
# into a long table with one row per variable, then list only the variables
# that actually have missing values.
# pivot_longer() is used in place of the superseded gather().
missing_airbnb <- airbnb %>%
  summarise_all(~ sum(is.na(.))) %>%
  pivot_longer(
    cols = everything(),
    names_to = "variables",
    values_to = "missing"
  )
missing_airbnb %>% filter(missing > 0)
## # A tibble: 4 x 2
##   variables            missing
##   <chr>                  <int>
## 1 name                      66
## 2 host_name                 60
## 3 bedrooms                  23
## 4 review_scores_rating   17455
# Data cleaning, done in a single pipeline:
#   - keep only listings that have at least one review and a non-zero price;
#   - drop the rows whose rating or bedroom count is missing.
airbnb <- airbnb %>%
  filter(number_of_reviews != 0, price != 0) %>%
  drop_na(review_scores_rating, bedrooms)

# Overview of the table after cleaning.
airbnb %>% glimpse()
## Observations: 17,979
## Variables: 17
## $ name                             <chr> "Very Nice 2Br - Copacabana - W…
## $ host_name                        <fct> Matthias, Viviane, Renata, Patr…
## $ neighbourhood_cleansed           <fct> Copacabana, Copacabana, Ipanema…
## $ latitude                         <dbl> -22.96592, -22.97712, -22.98302…
## $ longitude                        <dbl> -43.17896, -43.19045, -43.21427…
## $ property_type                    <fct> Condominium, Apartment, Apartme…
## $ room_type                        <fct> Entire home/apt, Entire home/ap…
## $ price                            <dbl> 296, 161, 243, 337, 221, 3250, …
## $ accommodates                     <dbl> 5, 3, 3, 3, 2, 13, 1, 11, 4, 6,…
## $ bedrooms                         <dbl> 2, 1, 1, 1, 1, 6, 1, 4, 1, 3, 4…
## $ minimum_nights                   <dbl> 4, 4, 2, 2, 3, 2, 3, 4, 3, 2, 1…
## $ availability_365                 <dbl> 332, 352, 125, 122, 145, 298, 3…
## $ number_of_reviews                <dbl> 233, 232, 260, 160, 303, 54, 40…
## $ review_scores_rating             <dbl> 93, 94, 96, 94, 98, 91, 98, 80,…
## $ cancellation_policy              <fct> strict_14_with_grace_period, st…
## $ require_guest_profile_picture    <lgl> FALSE, TRUE, FALSE, TRUE, FALSE…
## $ require_guest_phone_verification <lgl> FALSE, TRUE, FALSE, TRUE, TRUE,…

Missing após remover

# Recount the missing values (NA) per column on the current table: one row
# per variable, showing only the variables that still have missing values.
# pivot_longer() is used in place of the superseded gather().
missing_airbnb <- airbnb %>%
  summarise_all(~ sum(is.na(.))) %>%
  pivot_longer(
    cols = everything(),
    names_to = "variables",
    values_to = "missing"
  )
missing_airbnb %>% filter(missing > 0)
## # A tibble: 2 x 2
##   variables missing
##   <chr>       <int>
## 1 name            2
## 2 host_name      40

Visualização

Bairros

# Number of most frequent neighbourhoods shown individually; all remaining
# neighbourhoods are pooled into a single "Outros" bar.
n_bairros <- 7

# Listings per neighbourhood, most frequent first. Rows after the first
# `n_bairros` are relabelled "Outros" and their counts are summed.
# Labelling by row position (rather than indexing 1:n_bairros and repeating
# "Outros" n() - n_bairros times) also works when the data has fewer than
# `n_bairros` distinct neighbourhoods.
bairros <- airbnb %>%
  count(neighbourhood_cleansed, sort = TRUE) %>%
  mutate(
    bairro = if_else(
      row_number() <= n_bairros,
      as.character(neighbourhood_cleansed),
      "Outros"
    ),
    # Levels follow the frequency order, with "Outros" last, so the bars are
    # drawn in that order.
    bairro = factor(bairro, levels = unique(bairro))
  ) %>%
  group_by(bairro) %>%
  summarise(n = sum(n))

# Bar chart of the counts, with the value printed above each bar.
bairros %>%
  ggplot(aes(bairro, n, fill = bairro)) +
  geom_col() +
  geom_text(aes(label = n), vjust = -0.4, size = 3.5) +
  theme(legend.position = "none") +
  xlab("Bairro") +
  ylab("Frequência")

Tipo de quarto

# Bar chart of the number of listings per room type, with the count printed
# above each bar.
airbnb %>%
  ggplot(aes(x = room_type, fill = room_type)) +
  geom_bar() +
  geom_text(
    aes(label = ..count..),
    stat = "count",
    vjust = -0.4,
    size = 3.5
  )

Tipo de propriedade

# Number of most frequent property types shown individually; all remaining
# types are pooled into a single "Outros" bar.
n_tipos <- 6

# Listings per property type, most frequent first. Rows after the first
# `n_tipos` are relabelled "Outros" and their counts are summed.
# Labelling by row position (rather than indexing 1:n_tipos and repeating
# "Outros" n() - n_tipos times) also works when the data has fewer than
# `n_tipos` distinct property types.
tipos_propriedade <- airbnb %>%
  count(property_type, sort = TRUE) %>%
  mutate(
    tipo_propriedade = if_else(
      row_number() <= n_tipos,
      as.character(property_type),
      "Outros"
    ),
    # Levels follow the frequency order, with "Outros" last.
    tipo_propriedade = factor(tipo_propriedade, levels = unique(tipo_propriedade))
  ) %>%
  group_by(tipo_propriedade) %>%
  summarise(n = sum(n))

# Bar chart of the counts. Only the x tick labels are hidden (the legend
# identifies the bars); the y-axis values stay visible, as in the
# cancellation-policy chart.
tipos_propriedade %>%
  ggplot(aes(tipo_propriedade, n, fill = tipo_propriedade)) +
  geom_col() +
  geom_text(aes(label = n), vjust = -0.4, size = 3.5) +
  xlab("Tipo de propriedade") +
  ylab("Frequência") +
  labs(fill = "Tipo de propriedade") +
  theme(axis.text.x = element_blank())

Política de cancelamento

# Listings per cancellation policy, with bars ordered from the most to the
# least frequent policy. The x tick labels are hidden, so the legend is what
# identifies each bar.
airbnb %>%
  count(cancellation_policy, sort = TRUE) %>%
  mutate(politica = reorder(cancellation_policy, -n)) %>%
  ggplot(aes(x = politica, y = n, fill = politica)) +
  geom_col() +
  geom_text(aes(label = n), vjust = -0.4, size = 3.5) +
  labs(
    x = "Política de cancelamento",
    y = "Frequência",
    fill = "Política de cancelamento"
  ) +
  theme(axis.text.x = element_blank())

Requer foto de perfil do hóspede

# Theme shared by the pie charts: hides the axis titles, the x-axis text,
# the panel border, the grid lines and the axis ticks, and sets the plot
# title in bold at size 14.
blank_theme <- theme(
  plot.title = element_text(size = 14, face = "bold"),
  axis.title.x = element_blank(),
  axis.title.y = element_blank(),
  axis.text.x = element_blank(),
  axis.ticks = element_blank(),
  panel.border = element_blank(),
  panel.grid = element_blank()
)

# Pie chart (a single stacked bar drawn in polar coordinates) of how many
# listings require a guest profile picture; each slice shows its count.
ggplot(airbnb, aes(x = "", fill = require_guest_profile_picture)) +
  geom_bar(width = 1) +
  geom_text(
    aes(label = ..count..),
    stat = "count",
    position = position_stack(vjust = 0.5),
    color = "white"
  ) +
  coord_polar("y", start = 0) +
  blank_theme +
  labs(fill = "", title = "Requer foto de perfil do hóspede")

Requer que o hóspede tenha telefone verificado

# Pie chart (a single stacked bar drawn in polar coordinates) of how many
# listings require a verified guest phone; each slice shows its count.
# Uses the `blank_theme` object defined earlier in the document.
ggplot(airbnb, aes(x = "", fill = require_guest_phone_verification)) +
  geom_bar(width = 1) +
  geom_text(
    aes(label = ..count..),
    stat = "count",
    position = position_stack(vjust = 0.5),
    color = "white"
  ) +
  coord_polar(theta = "y", start = 0) +
  blank_theme +
  labs(fill = "", title = "Requer que o hóspede tenha telefone verificado")

Accommodates

# Summary statistics of the guest capacity of each listing.
summary(airbnb$accommodates)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   4.000   3.988   5.000 160.000
# Boxplot of the guest capacity, restricted to listings below 50 guests.
airbnb %>%
  filter(accommodates < 50) %>%
  ggplot(aes(x = "", y = accommodates)) +
  geom_boxplot(fill = "purple") +
  labs(
    x = "",
    y = "Número",
    title = "Quantidade máxima de pessoas acomodadas",
    subtitle = "Removido um valor 160"
  ) +
  theme(legend.position = "none")

# Histogram of the guest capacity under the same restriction.
airbnb %>%
  filter(accommodates < 50) %>%
  ggplot(aes(accommodates)) +
  geom_histogram(bins = 31, fill = "purple") +
  labs(
    x = "",
    y = "Frequência",
    title = "Quantidade máxima de pessoas acomodadas",
    subtitle = "Removido um valor 160"
  ) +
  theme(legend.position = "none")

Quartos

# Summary statistics of the number of bedrooms.
summary(airbnb$bedrooms)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.000   1.000   1.511   2.000  13.000
# Boxplot of the number of bedrooms.
airbnb %>%
  ggplot(aes(x = "", y = bedrooms)) +
  geom_boxplot(fill = "lightblue") +
  labs(x = "", y = "Número", title = "Número de quartos") +
  theme(legend.position = "none")

# The same summary is printed again ahead of the histogram.
summary(airbnb$bedrooms)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.000   1.000   1.511   2.000  13.000
# Histogram of the number of bedrooms.
airbnb %>%
  ggplot(aes(bedrooms)) +
  geom_histogram(bins = 28, fill = "lightblue") +
  labs(x = "Quartos", y = "Frequência", title = "Número de quartos") +
  theme(legend.position = "none")

Número mínimo de noites

# Summary statistics of the minimum number of nights per booking.
airbnb$minimum_nights %>% summary()
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    1.000    2.000    2.000    3.698    3.000 1123.000
# Boxplot on a log10 y axis, so the very large values do not flatten the box.
airbnb %>%
  ggplot(aes(x = "", y = minimum_nights)) +
  geom_boxplot(fill = "lightblue") +
  scale_y_log10() +
  xlab("") +
  ylab("Noites") +
  ggtitle("Número mínimo de noites", subtitle = "Com escala logarítmica") +
  theme(legend.position = "none")

# Histogram over the full range of values (no filtering).
airbnb %>%
  ggplot(aes(minimum_nights)) +
  geom_histogram(bins = 28, fill = "lightgreen") +
  xlab("Noites") +
  ylab("Frequência") +
  ggtitle("Número mínimo de noites") +
  theme(legend.position = "none")

# Histogram restricted to at most 20 nights. The variable is a whole number
# of nights, so a bin width of 1 gives each value its own bar; a fixed
# `bins = 19` over the 20 possible values made two adjacent values share a
# bar.
airbnb %>%
  filter(minimum_nights <= 20) %>%
  ggplot(aes(minimum_nights)) +
  geom_histogram(binwidth = 1, fill = "lightgreen") +
  xlab("Noites") +
  ylab("Frequência") +
  ggtitle("Número mínimo de noites", subtitle = "Removidos valores maiores que 20") +
  theme(legend.position = "none")

Disponibilidade 365

# Summary statistics of the availability_365 column.
summary(airbnb$availability_365)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    68.0   180.0   195.1   349.0   365.0
# Boxplot of availability_365.
airbnb %>%
  ggplot(aes(x = "", y = availability_365)) +
  geom_boxplot(fill = "blue") +
  labs(x = "", y = "Dias", title = "Disponibilidade 365") +
  theme(legend.position = "none")

# Histogram of availability_365.
airbnb %>%
  ggplot(aes(availability_365)) +
  geom_histogram(bins = 28, fill = "blue") +
  labs(x = "Dias", y = "Frequência", title = "Disponibilidade 365") +
  theme(legend.position = "none")

Número de avaliações

# Summary statistics of the number of reviews per listing.
summary(airbnb$number_of_reviews)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    2.00    4.00   15.61   15.00  350.00
# Boxplot of the number of reviews on a log10 y axis.
airbnb %>%
  ggplot(aes(x = "", y = number_of_reviews)) +
  geom_boxplot(fill = "pink") +
  scale_y_log10() +
  labs(
    x = "",
    y = "Avaliações",
    title = "Número de avaliações",
    subtitle = "Com escala logaritmica"
  ) +
  theme(legend.position = "none")

# Histogram restricted to listings with at most 150 reviews.
airbnb %>%
  filter(number_of_reviews <= 150) %>%
  ggplot(aes(number_of_reviews)) +
  geom_histogram(bins = 30, fill = "pink") +
  labs(
    x = "Avaliações",
    y = "Frequência",
    title = "Número de avaliações",
    subtitle = "Filtrados valores acima de 150"
  ) +
  theme(legend.position = "none")

Avaliação

# Summary statistics of the review score of each listing.
airbnb$review_scores_rating %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   20.00   93.00   98.00   94.35  100.00  100.00
# Boxplot of the review score. The title and y label name the score
# ("Avaliação"); they previously repeated the labels of the review-count
# section.
airbnb %>%
  ggplot(aes(x = "", y = review_scores_rating)) +
  geom_boxplot(fill = "#18d9cc") +
  xlab("") +
  ylab("Avaliação") +
  ggtitle("Avaliação") +
  theme(legend.position = "none")

# Histogram of every review score.
airbnb %>%
  ggplot(aes(review_scores_rating)) +
  geom_histogram(bins = 50, fill = "#18d9cc") +
  xlab("Avaliação") +
  ylab("Frequência") +
  ggtitle("Avaliação") +
  theme(legend.position = "none")

# Histogram without the scores equal to 100, so the rest of the
# distribution is visible.
airbnb %>%
  filter(review_scores_rating < 100) %>%
  ggplot(aes(review_scores_rating)) +
  geom_histogram(bins = 50, fill = "#18d9cc") +
  xlab("Avaliação") +
  ylab("Frequência") +
  ggtitle("Avaliação", subtitle = "Removidas notas 100") +
  theme(legend.position = "none")

# Pie chart of listings with the top score (100) versus the rest. The counts
# are computed by the "count" stat, so no group_by() is needed before
# plotting.
airbnb %>%
  mutate(notamaxima = (review_scores_rating == 100)) %>%
  ggplot(aes(x = "", fill = notamaxima)) +
  geom_bar(width = 1) +
  coord_polar("y", start = 0) +
  blank_theme +
  geom_text(
    stat = "count",
    aes(label = ..count..),
    position = position_stack(vjust = 0.5),
    color = "white"
  ) +
  labs(fill = "Nota máxima") +
  ggtitle("Nota máxima (100)")

Preço

# Stacked histogram of the price, coloured by room type, on the original
# (linear) x axis.
ggplot(airbnb, aes(price, fill = room_type)) +
  geom_histogram(bins = 30) +
  ggtitle("Distribuição de preço",
          subtitle = "A distribuição é muito inclinada")

# Same histogram with a log10 x axis.
ggplot(airbnb, aes(price, fill = room_type)) +
  geom_histogram(bins = 30) +
  ggtitle("Distribuição transformada do preço",
          subtitle = expression("Com uma transformação" ~'log'[10] ~ "do eixo x")) +
  scale_x_log10()

# Density-scaled histograms on a log10 x axis, one panel per room type.
ggplot(airbnb, aes(price, fill = room_type)) +
  geom_histogram(bins = 30, aes(y = ..density..), show.legend = FALSE) +
  facet_wrap(~room_type) +
  scale_x_log10()

# Boxplots of price per room type on a log10 y axis. The dashed purple line
# marks the mean price over all listings.
ggplot(airbnb, aes(x = room_type, y = price)) +
  geom_boxplot(aes(fill = room_type)) +
  scale_y_log10() +
  xlab("Tipo de quarto") +
  ylab("Preço") +
  ggtitle("Boxplots de preço por tipo de quarto") +
  geom_hline(yintercept = mean(airbnb$price), color = "purple", linetype = 2) +
  theme(legend.position = "none")

Correlação

library(corrplot)
## corrplot 0.84 loaded
# Keep only the numeric columns. vapply() always returns a logical vector,
# whereas the return type of sapply() depends on its input.
airbnb_cor <- airbnb[, vapply(airbnb, is.numeric, logical(1))]
# Use only the rows with no missing value in any numeric column.
airbnb_cor <- airbnb_cor[complete.cases(airbnb_cor), ]
# Spearman rank correlation between every pair of numeric columns, drawn as
# a coloured matrix.
correlation_matrix <- cor(airbnb_cor, method = "spearman")
corrplot(correlation_matrix, method = "color")

Mapa com as listagens

# Colour palette that assigns a colour to each room type.
pal <- colorFactor(
  palette = c("red", "green", "blue", "purple", "yellow"),
  domain = airbnb$room_type
)

# Interactive map with one small circle per listing, coloured by room type
# and labelled with the listing name, plus a legend for the room types.
airbnb %>%
  leaflet() %>%
  addProviderTiles(providers$CartoDB.DarkMatterNoLabels) %>%
  addCircleMarkers(
    ~longitude,
    ~latitude,
    radius = 1,
    weight = 1,
    color = ~pal(room_type),
    opacity = 1,
    fillOpacity = 0.1,
    label = paste("Name:", airbnb$name)
  ) %>%
  addLegend(
    "bottomright",
    pal = pal,
    values = ~room_type,
    title = "Room types",
    opacity = 1
  )
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# 3D scatter plot of price over longitude/latitude, coloured by room type.
# The trace type and mode are set explicitly (the values plotly would
# otherwise pick on its own), so the call no longer prints the
# "No trace type specified" / "No scatter3d mode specifed" messages.
plot_ly(
  airbnb,
  x = ~longitude,
  y = ~latitude,
  z = ~price,
  color = ~room_type,
  type = "scatter3d",
  mode = "markers"
)